{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from pandas import Series, DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>passenger_count</th>\n",
       "      <th>trip_distance</th>\n",
       "      <th>total_amount</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1.5</td>\n",
       "      <td>9.95</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>2.6</td>\n",
       "      <td>16.30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>0.0</td>\n",
       "      <td>5.80</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>5</td>\n",
       "      <td>0.0</td>\n",
       "      <td>7.55</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0.0</td>\n",
       "      <td>55.55</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   passenger_count  trip_distance  total_amount\n",
       "0                1            1.5          9.95\n",
       "1                1            2.6         16.30\n",
       "2                3            0.0          5.80\n",
       "3                5            0.0          7.55\n",
       "4                5            0.0         55.55"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "filename = '../data/nyc_taxi_2019-01.csv'\n",
    "\n",
    "df = pd.read_csv(filename,\n",
    "                usecols=['passenger_count',\n",
    "                         'trip_distance', 'total_amount'],\n",
    "                dtype={'total_amount':np.float128})\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Beyond 1\n",
    "\n",
    "In which five rides did people pay the most per mile?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>passenger_count</th>\n",
       "      <th>trip_distance</th>\n",
       "      <th>total_amount</th>\n",
       "      <th>cost_per_mile</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>4136499</th>\n",
       "      <td>1</td>\n",
       "      <td>0.01</td>\n",
       "      <td>273.96</td>\n",
       "      <td>27396.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6403254</th>\n",
       "      <td>1</td>\n",
       "      <td>0.01</td>\n",
       "      <td>322.30</td>\n",
       "      <td>32230.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7099014</th>\n",
       "      <td>4</td>\n",
       "      <td>0.01</td>\n",
       "      <td>415.30</td>\n",
       "      <td>41530.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>478791</th>\n",
       "      <td>1</td>\n",
       "      <td>0.10</td>\n",
       "      <td>6667.45</td>\n",
       "      <td>66674.500000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2499600</th>\n",
       "      <td>1</td>\n",
       "      <td>2.40</td>\n",
       "      <td>623261.66</td>\n",
       "      <td>259692.358333</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         passenger_count  trip_distance  total_amount  cost_per_mile\n",
       "4136499                1           0.01        273.96   27396.000000\n",
       "6403254                1           0.01        322.30   32230.000000\n",
       "7099014                4           0.01        415.30   41530.000000\n",
       "478791                 1           0.10       6667.45   66674.500000\n",
       "2499600                1           2.40     623261.66  259692.358333"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# First, remove 0-length trips.\n",
    "df = df[df['trip_distance'] != 0]\n",
    "\n",
    "# Create a new column, in which we calculate the cost per mile\n",
    "df['cost_per_mile'] = df['total_amount'] / df['trip_distance']\n",
    "\n",
    "# Now sort the data frame by that column, and get the 5 highest values\n",
    "df.sort_values('cost_per_mile').tail(5)\n",
    "\n",
    "# Obviously, the data is a bit messed up, given that some trips went 0.01 \n",
    "# miles and at least one trip cost $623,261!"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Beyond 2\n",
    "\n",
    "Let's assume that multi-passenger rides are split evenly among the passengers. Given that assumption, in which 10 rides did each individual pay the greatest amount? And again, how far did they travel?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>passenger_count</th>\n",
       "      <th>trip_distance</th>\n",
       "      <th>total_amount</th>\n",
       "      <th>cost_per_mile</th>\n",
       "      <th>payment_per_person</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>5031491</th>\n",
       "      <td>2</td>\n",
       "      <td>64.30</td>\n",
       "      <td>343.32</td>\n",
       "      <td>5.339347</td>\n",
       "      <td>171.66</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4563340</th>\n",
       "      <td>2</td>\n",
       "      <td>0.40</td>\n",
       "      <td>350.30</td>\n",
       "      <td>875.750000</td>\n",
       "      <td>175.15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4202883</th>\n",
       "      <td>2</td>\n",
       "      <td>60.23</td>\n",
       "      <td>369.06</td>\n",
       "      <td>6.127511</td>\n",
       "      <td>184.53</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4751745</th>\n",
       "      <td>2</td>\n",
       "      <td>100.78</td>\n",
       "      <td>403.50</td>\n",
       "      <td>4.003771</td>\n",
       "      <td>201.75</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5726185</th>\n",
       "      <td>2</td>\n",
       "      <td>65.05</td>\n",
       "      <td>416.82</td>\n",
       "      <td>6.407686</td>\n",
       "      <td>208.41</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149362</th>\n",
       "      <td>2</td>\n",
       "      <td>17.20</td>\n",
       "      <td>426.80</td>\n",
       "      <td>24.813953</td>\n",
       "      <td>213.40</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7593395</th>\n",
       "      <td>2</td>\n",
       "      <td>83.61</td>\n",
       "      <td>449.32</td>\n",
       "      <td>5.373998</td>\n",
       "      <td>224.66</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3842620</th>\n",
       "      <td>2</td>\n",
       "      <td>110.04</td>\n",
       "      <td>515.82</td>\n",
       "      <td>4.687568</td>\n",
       "      <td>257.91</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3014027</th>\n",
       "      <td>2</td>\n",
       "      <td>16.60</td>\n",
       "      <td>560.76</td>\n",
       "      <td>33.780723</td>\n",
       "      <td>280.38</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2972145</th>\n",
       "      <td>2</td>\n",
       "      <td>19.90</td>\n",
       "      <td>589.96</td>\n",
       "      <td>29.646231</td>\n",
       "      <td>294.98</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         passenger_count  trip_distance  total_amount  cost_per_mile  \\\n",
       "5031491                2          64.30        343.32       5.339347   \n",
       "4563340                2           0.40        350.30     875.750000   \n",
       "4202883                2          60.23        369.06       6.127511   \n",
       "4751745                2         100.78        403.50       4.003771   \n",
       "5726185                2          65.05        416.82       6.407686   \n",
       "149362                 2          17.20        426.80      24.813953   \n",
       "7593395                2          83.61        449.32       5.373998   \n",
       "3842620                2         110.04        515.82       4.687568   \n",
       "3014027                2          16.60        560.76      33.780723   \n",
       "2972145                2          19.90        589.96      29.646231   \n",
       "\n",
       "         payment_per_person  \n",
       "5031491              171.66  \n",
       "4563340              175.15  \n",
       "4202883              184.53  \n",
       "4751745              201.75  \n",
       "5726185              208.41  \n",
       "149362               213.40  \n",
       "7593395              224.66  \n",
       "3842620              257.91  \n",
       "3014027              280.38  \n",
       "2972145              294.98  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Remove trips with <2 passengers\n",
    "df = df[df['passenger_count'] >= 2]\n",
    "\n",
    "# Create a new column based on these values\n",
    "df['payment_per_person'] = df['total_amount'] / df['passenger_count']\n",
    "\n",
    "# Find the highest per-person payment\n",
    "df.sort_values('payment_per_person').tail(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Beyond 3\n",
    "\n",
    "In the exercise solution, I showed that we needed to use `iloc` or `head`/`tail` to retrieve the first/last 20 rows, because the index was all scrambled after our sort operation. But you can pass `ignore_index=True` to `sort_values`, and then the resulting data frame will have a numeric index, starting at 0. Use this option, and `loc`, to get the mean `total_amount` for the 20 longest trips."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "253.65904761904761955"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.sort_values('trip_distance',\n",
    "                ascending=False,\n",
    "              ignore_index=True)['total_amount'].loc[:20].mean()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
